{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n"
     ]
    }
   ],
   "source": [
    "from __future__ import print_function, division\n",
    "\n",
    "import os\n",
    "import os.path\n",
    "import pandas as pd\n",
    "from io import StringIO\n",
    "import io\n",
    "\n",
    "import numpy as np\n",
    "from numpy import array"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "SEQUENCE_LENGTH = 40\n",
    "SEQUENCE_LENGTH_D = 25\n",
    "#read train and val data\n",
    "fpath = 'data/TOEFL'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Short aliases for the length constants; the embedding cells use these names.\n",
    "SEQUENCE_LEN_D = SEQUENCE_LENGTH_D\n",
    "SEQUENCE_LEN = SEQUENCE_LENGTH\n",
    "\n",
    "\n",
    "def one_hot(labels, classes):\n",
    "    \"\"\"Encode labels as a dense float64 one-hot matrix.\n",
    "\n",
    "    Args:\n",
    "        labels: 1-D array-like of class labels.\n",
    "        classes: 1-D array-like of the distinct classes; column j of the\n",
    "            result corresponds to classes[j].\n",
    "\n",
    "    Returns:\n",
    "        Array of shape (len(labels), len(classes)) holding 1.0 where the\n",
    "        label equals the column's class and 0.0 elsewhere.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a label does not occur in classes.\n",
    "    \"\"\"\n",
    "    labels = np.asarray(labels)\n",
    "    classes = np.asarray(classes)\n",
    "    unknown = np.setdiff1d(labels, classes)\n",
    "    if unknown.size:\n",
    "        raise ValueError('labels not present in classes: %s' % unknown.tolist())\n",
    "    return (labels.reshape(-1, 1) == classes.reshape(1, -1)).astype(np.float64)\n",
    "\n",
    "\n",
    "# Load both splits; the evaluation split is read from test.csv.\n",
    "df_train = pd.read_csv(os.path.join(fpath, 'train.csv'))\n",
    "df_val = pd.read_csv(os.path.join(fpath, 'test.csv'))\n",
    "\n",
    "text = df_train['text1']\n",
    "text_train = df_train['text1']\n",
    "text_val = df_val['text1']\n",
    "rank_train = df_train['label']\n",
    "rank_val = df_val['label']\n",
    "\n",
    "target_train = np.array(rank_train)\n",
    "target_val = np.array(rank_val)\n",
    "\n",
    "# The class list comes from the training labels only and is reused for the\n",
    "# test labels, so both matrices share the same column order. (Deriving the\n",
    "# columns separately from the test labels can shift or drop columns whenever\n",
    "# the two splits do not contain exactly the same label set.)\n",
    "# np.unique returns the classes sorted, one column per distinct label.\n",
    "label_classes = np.unique(target_train)\n",
    "y_train = one_hot(target_train, label_classes)\n",
    "y_test = one_hot(target_val, label_classes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/bert_serving/client/__init__.py:286: UserWarning: some of your sentences have more tokens than \"max_seq_len=40\" set on the server, as consequence you may get less-accurate or truncated embeddings.\n",
      "here is what you can do:\n",
      "- disable the length-check by create a new \"BertClient(check_length=False)\" when you do not want to display this warning\n",
      "- or, start a new server with a larger \"max_seq_len\"\n",
      "  '- or, start a new server with a larger \"max_seq_len\"' % self.length_limit)\n"
     ]
    }
   ],
   "source": [
    "# Sentence-encoding setup: nltk splits each essay into sentences and a\n",
    "# bert-serving client encodes them (BERT base uncased model).\n",
    "# Client documentation: https://pypi.org/project/bert-serving-client/\n",
    "from bert_serving.client import BertClient\n",
    "from nltk import sent_tokenize\n",
    "\n",
    "# Width of one embedding vector; used to size the zero padding.\n",
    "b_len = 768\n",
    "# NOTE(review): requires a bert-serving server to be reachable already;\n",
    "# the recorded warning shows it was started with max_seq_len=40.\n",
    "bc = BertClient()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Encode the training essays. Every essay contributes exactly SEQUENCE_LEN_D\n",
    "# entries: the encodings of its first sentences, then all-zero padding blocks.\n",
    "# NOTE(review): a padding block is (SEQUENCE_LEN, b_len), so this assumes\n",
    "# bc.encode returns one matrix of that shape per sentence - confirm against\n",
    "# the server's pooling settings.\n",
    "pad_block = [[0] * b_len] * SEQUENCE_LEN  # pad token maps to 0\n",
    "X_train = []\n",
    "\n",
    "for essay in df_train['text1']:\n",
    "    sentences = sent_tokenize(essay)[:SEQUENCE_LEN_D]\n",
    "    # bc.encode rejects an empty list, so an essay without any sentence\n",
    "    # is represented by padding only instead of aborting the whole run.\n",
    "    if sentences:\n",
    "        X_train.extend(bc.encode(sentences))\n",
    "    X_train.extend([pad_block] * (SEQUENCE_LEN_D - len(sentences)))\n",
    "\n",
    "X_train = np.array(X_train)\n",
    "\n",
    "# Persist features and one-hot labels (np.save adds the .npy suffix).\n",
    "np.save('X_train_TOEFL', X_train)\n",
    "np.save('y_train_TOEFL', y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# The training arrays are already saved to disk; drop them to free memory\n",
    "# before the test split is encoded.\n",
    "import gc\n",
    "\n",
    "del X_train, y_train\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/anaconda3/lib/python3.6/site-packages/bert_serving/client/__init__.py:286: UserWarning: some of your sentences have more tokens than \"max_seq_len=40\" set on the server, as consequence you may get less-accurate or truncated embeddings.\n",
      "here is what you can do:\n",
      "- disable the length-check by create a new \"BertClient(check_length=False)\" when you do not want to display this warning\n",
      "- or, start a new server with a larger \"max_seq_len\"\n",
      "  '- or, start a new server with a larger \"max_seq_len\"' % self.length_limit)\n"
     ]
    }
   ],
   "source": [
    "# Encode the evaluation essays (df_val, read from test.csv). Every essay\n",
    "# contributes exactly SEQUENCE_LEN_D entries: the encodings of its first\n",
    "# sentences, then all-zero padding blocks.\n",
    "# NOTE(review): a padding block is (SEQUENCE_LEN, b_len), so this assumes\n",
    "# bc.encode returns one matrix of that shape per sentence - confirm against\n",
    "# the server's pooling settings.\n",
    "pad_block = [[0] * b_len] * SEQUENCE_LEN  # pad token maps to 0\n",
    "X_test = []\n",
    "\n",
    "for essay in df_val['text1']:\n",
    "    sentences = sent_tokenize(essay)[:SEQUENCE_LEN_D]\n",
    "    # bc.encode rejects an empty list, so an essay without any sentence\n",
    "    # is represented by padding only instead of aborting the whole run.\n",
    "    if sentences:\n",
    "        X_test.extend(bc.encode(sentences))\n",
    "    X_test.extend([pad_block] * (SEQUENCE_LEN_D - len(sentences)))\n",
    "\n",
    "X_test = np.array(X_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the encoded test split and its one-hot labels as .npy files.\n",
    "# Sanity check: X_test.shape[0] / SEQUENCE_LEN_D is the number of test essays,\n",
    "# because every essay contributes exactly SEQUENCE_LEN_D entries.\n",
    "for out_name, out_array in (('X_test_TOEFL', X_test), ('y_test_TOEFL', y_test)):\n",
    "    np.save(out_name, out_array)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
